{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas数据统计函数\n",
    "\n",
    "1.汇总类统计<br>\n",
    "2.唯一去重和按值计数<br>\n",
    "3.相关系数和协方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0. 读取csv数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "fpath=\"../datas/weather_20230115134249.csv\"\n",
    "df = pd.read_csv(fpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>城市</th>\n",
       "      <th>行政区</th>\n",
       "      <th>观测站</th>\n",
       "      <th>气温(度)</th>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <th>累积雨量(mm)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2015-01-01</td>\n",
       "      <td>新北市</td>\n",
       "      <td>烏來區</td>\n",
       "      <td>福山</td>\n",
       "      <td>13.7℃</td>\n",
       "      <td>92</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2015-01-02</td>\n",
       "      <td>臺南市</td>\n",
       "      <td>安平區</td>\n",
       "      <td>安平</td>\n",
       "      <td>23.5℃</td>\n",
       "      <td>70</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>臺東縣</td>\n",
       "      <td>東河鄉</td>\n",
       "      <td>七塊厝</td>\n",
       "      <td>19.6℃</td>\n",
       "      <td>86</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期   城市  行政区  观测站  气温(度)  相对湿度(%)  累积雨量(mm)\n",
       "0  2015-01-01  新北市  烏來區   福山  13.7℃       92       0.0\n",
       "1  2015-01-02  臺南市  安平區   安平  23.5℃       70       0.0\n",
       "2  2015-01-03  臺東縣  東河鄉  七塊厝  19.6℃       86       0.0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 替换温度的后缀℃\n",
    "df.loc[:,\"气温(度)\"] = df[\"气温(度)\"].str.replace(\"℃\",\"\").astype(\"float\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>城市</th>\n",
       "      <th>行政区</th>\n",
       "      <th>观测站</th>\n",
       "      <th>气温(度)</th>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <th>累积雨量(mm)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2015-01-01</td>\n",
       "      <td>新北市</td>\n",
       "      <td>烏來區</td>\n",
       "      <td>福山</td>\n",
       "      <td>13.7</td>\n",
       "      <td>92</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2015-01-02</td>\n",
       "      <td>臺南市</td>\n",
       "      <td>安平區</td>\n",
       "      <td>安平</td>\n",
       "      <td>23.5</td>\n",
       "      <td>70</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2015-01-03</td>\n",
       "      <td>臺東縣</td>\n",
       "      <td>東河鄉</td>\n",
       "      <td>七塊厝</td>\n",
       "      <td>19.6</td>\n",
       "      <td>86</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期   城市  行政区  观测站  气温(度)  相对湿度(%)  累积雨量(mm)\n",
       "0  2015-01-01  新北市  烏來區   福山   13.7       92       0.0\n",
       "1  2015-01-02  臺南市  安平區   安平   23.5       70       0.0\n",
       "2  2015-01-03  臺東縣  東河鄉  七塊厝   19.6       86       0.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 汇总类统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>气温(度)</th>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <th>累积雨量(mm)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>484.000000</td>\n",
       "      <td>484.000000</td>\n",
       "      <td>484.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>16.736364</td>\n",
       "      <td>-190.592975</td>\n",
       "      <td>-8.199380</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>18.351663</td>\n",
       "      <td>1614.803969</td>\n",
       "      <td>31.338267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-99.000000</td>\n",
       "      <td>-9900.000000</td>\n",
       "      <td>-99.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>15.275000</td>\n",
       "      <td>66.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>19.350000</td>\n",
       "      <td>77.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>23.425000</td>\n",
       "      <td>88.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>30.600000</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>87.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            气温(度)      相对湿度(%)    累积雨量(mm)\n",
       "count  484.000000   484.000000  484.000000\n",
       "mean    16.736364  -190.592975   -8.199380\n",
       "std     18.351663  1614.803969   31.338267\n",
       "min    -99.000000 -9900.000000  -99.000000\n",
       "25%     15.275000    66.000000    0.000000\n",
       "50%     19.350000    77.000000    0.000000\n",
       "75%     23.425000    88.000000    0.000000\n",
       "max     30.600000   100.000000   87.000000"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 提取所有数字列统计结果\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16.73636363636366"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看单个Series的数据\n",
    "df[\"气温(度)\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "30.6"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 最高温\n",
    "df[\"气温(度)\"].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-99.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 最低温\n",
    "df[\"气温(度)\"].min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 唯一去重和按值计数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 唯一性去重\n",
    "一般不用于数值列，而是枚举、分类列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method Series.unique of 0      新北市\n",
       "1      臺南市\n",
       "2      臺東縣\n",
       "3      新北市\n",
       "4      南投縣\n",
       "5      嘉義縣\n",
       "6      花蓮縣\n",
       "7      嘉義市\n",
       "8      宜蘭縣\n",
       "9      新北市\n",
       "10     彰化縣\n",
       "11     雲林縣\n",
       "12     南投縣\n",
       "13     臺東縣\n",
       "14     屏東縣\n",
       "15     苗栗縣\n",
       "16     新北市\n",
       "17     苗栗縣\n",
       "18     臺南市\n",
       "19     屏東縣\n",
       "20     新北市\n",
       "21     宜蘭縣\n",
       "22     彰化縣\n",
       "23     臺中市\n",
       "24     臺中市\n",
       "25     臺東縣\n",
       "26     嘉義縣\n",
       "27     臺南市\n",
       "28     屏東縣\n",
       "29     宜蘭縣\n",
       "      ... \n",
       "454    臺南市\n",
       "455    宜蘭縣\n",
       "456    臺東縣\n",
       "457    雲林縣\n",
       "458    屏東縣\n",
       "459    屏東縣\n",
       "460    嘉義縣\n",
       "461    南投縣\n",
       "462    高雄市\n",
       "463    桃園市\n",
       "464    新北市\n",
       "465    屏東縣\n",
       "466    臺東縣\n",
       "467    南投縣\n",
       "468    臺南市\n",
       "469    苗栗縣\n",
       "470    新北市\n",
       "471    屏東縣\n",
       "472    屏東縣\n",
       "473    臺南市\n",
       "474    新竹縣\n",
       "475    新北市\n",
       "476    屏東縣\n",
       "477    新北市\n",
       "478    高雄市\n",
       "479    新北市\n",
       "480    宜蘭縣\n",
       "481    花蓮縣\n",
       "482    新竹縣\n",
       "483    桃園市\n",
       "Name: 城市, Length: 484, dtype: object>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"城市\"].unique"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 按值计数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-99.0    11\n",
       " 17.4     7\n",
       " 19.4     7\n",
       " 13.5     7\n",
       " 23.2     6\n",
       " 23.8     6\n",
       " 16.5     6\n",
       " 25.1     6\n",
       " 23.9     6\n",
       " 16.3     6\n",
       " 25.2     6\n",
       " 22.0     5\n",
       " 19.3     5\n",
       " 14.8     5\n",
       " 14.1     5\n",
       " 13.6     5\n",
       " 15.6     5\n",
       " 19.5     5\n",
       " 13.4     5\n",
       " 17.0     5\n",
       " 23.7     5\n",
       " 21.7     4\n",
       " 21.4     4\n",
       " 24.6     4\n",
       " 15.3     4\n",
       " 20.9     4\n",
       " 17.6     4\n",
       " 24.3     4\n",
       " 15.2     4\n",
       " 22.1     4\n",
       "         ..\n",
       " 18.0     1\n",
       " 28.9     1\n",
       " 26.8     1\n",
       " 22.9     1\n",
       " 13.2     1\n",
       " 27.9     1\n",
       " 5.1      1\n",
       " 27.8     1\n",
       " 11.3     1\n",
       " 26.6     1\n",
       " 12.2     1\n",
       " 29.2     1\n",
       " 29.3     1\n",
       " 9.7      1\n",
       " 19.7     1\n",
       " 18.9     1\n",
       " 6.5      1\n",
       " 27.6     1\n",
       " 21.9     1\n",
       " 13.0     1\n",
       " 2.0      1\n",
       " 29.6     1\n",
       " 20.0     1\n",
       " 14.0     1\n",
       " 21.5     1\n",
       " 24.0     1\n",
       " 10.4     1\n",
       " 11.2     1\n",
       " 10.2     1\n",
       " 12.7     1\n",
       "Name: 气温(度), Length: 179, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"气温(度)\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 相关系数和协方差\n",
    "用途：\n",
    "* 两只股票，是不是同涨同跌？程度多大？正相关还是负相关？\n",
    "* 产品销量的波动，跟哪些因素正相关、负相关，程度有多大？\n",
    "\n",
    "来自知乎，对于两个变量X,Y:\n",
    "1. 协方差：**衡量同反向程度**，如果协方差为正，说明X,Y同向运动，协方差越大说明同向程度越高；如果协方差为负，说明X,Y反向运动，协方差越小说明反向程度越高。<br>\n",
    "2. 相关系数：**衡量相似度程度**，当他们的相关系数为1时，说明两个变量变化时的正向相似度最大，当相关系数为-1时，说明两个两个变量的反向相似度最大。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>气温(度)</th>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <th>累积雨量(mm)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>气温(度)</th>\n",
       "      <td>336.783520</td>\n",
       "      <td>2.621960e+04</td>\n",
       "      <td>125.126830</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <td>26219.596142</td>\n",
       "      <td>2.607592e+06</td>\n",
       "      <td>11602.318381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>累积雨量(mm)</th>\n",
       "      <td>125.126830</td>\n",
       "      <td>1.160232e+04</td>\n",
       "      <td>982.086977</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 气温(度)       相对湿度(%)      累积雨量(mm)\n",
       "气温(度)       336.783520  2.621960e+04    125.126830\n",
       "相对湿度(%)   26219.596142  2.607592e+06  11602.318381\n",
       "累积雨量(mm)    125.126830  1.160232e+04    982.086977"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 协方差矩阵：\n",
    "df.cov()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>气温(度)</th>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <th>累积雨量(mm)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>气温(度)</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.884771</td>\n",
       "      <td>0.217571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>相对湿度(%)</th>\n",
       "      <td>0.884771</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.229271</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>累积雨量(mm)</th>\n",
       "      <td>0.217571</td>\n",
       "      <td>0.229271</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             气温(度)   相对湿度(%)  累积雨量(mm)\n",
       "气温(度)     1.000000  0.884771  0.217571\n",
       "相对湿度(%)   0.884771  1.000000  0.229271\n",
       "累积雨量(mm)  0.217571  0.229271  1.000000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 相关系数矩阵：\n",
    "df.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8847707858193214"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 单独查看湿度和温度的相关系数\n",
    "df[\"相对湿度(%)\"].corr(df[\"气温(度)\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ！！这就是特征工程对于机器学习重要性的一个例子"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
